### ###################################################################
### Takes in tidy by-election data and tidy polling data and merges them,
### together with the inflation (RPI) and candidacy data
### Lots of variable standardization performed here
### Output is working/tidy_model_data.rds
### ###################################################################

here::i_am("newR/03_combine_data.R")

library(tidyverse)
library(here)

### Read in the outcomes
dat <- readRDS(here::here("working", "tidy_by_elex_data.rds"))

### Load the tidied polling data
polls <- readRDS(here::here("working", "tidy_polls.rds"))

### Attach the polls to the outcomes, matching dat$Date against polls$date.
### Every row of dat is kept; poll rows with no matching by-election date are
### dropped. The row count is evaluated before and after the merge so the two
### values can be compared.
nrow(dat)
dat <- merge(x = dat,
             y = polls,
             by.x = "Date", by.y = "date",
             all.x = TRUE, all.y = FALSE)
nrow(dat)

### Merge in the inflation data
### capwords: capitalise the first letter of every space-separated word.
###
### Arguments:
###   s      - character vector; each element is processed independently.
###   strict - if TRUE, the remainder of each word is forced to lower case;
###            if FALSE (default) it is left untouched.
### Returns a character vector the same length as s (names of s are kept).
### Words are split on single spaces and re-joined with single spaces.
capwords <- function(s, strict = FALSE) {
    ## Capitalise the words belonging to one element of s and glue them back
    ## together into a single string.
    cap <- function(words) {
        rest <- substring(words, 2)
        if (strict) {
            rest <- tolower(rest)
        }
        paste0(toupper(substring(words, 1, 1)), rest, collapse = " ")
    }
    ## vapply() guarantees a character result, including character(0) for
    ## zero-length input (sapply() would return an empty list there).
    vapply(strsplit(s, split = " "), cap, character(1),
           USE.NAMES = !is.null(names(s)))
}

### Read the RPI series. The first 367 lines of the file are skipped and the
### remainder is read without a header; column 1 is kept as the period label
### and column 2 as the RPI value.
### NOTE(review): the skip count is tied to this particular download of the
### file -- presumably it jumps past the metadata and non-monthly rows.
### Confirm it whenever the file is refreshed.
infl <- read.csv(here::here("data",
                            "series-130820.csv"),
                 skip = 367,
                 header = FALSE) %>%
    dplyr::select(date = V1, RPI = V2) %>%
    ## Append a day-of-month so that the label can be parsed as a full date
    mutate(date = paste(date, "01")) %>%
    ## Normalise the case of the label before parsing it as year / month / day.
    ## NOTE(review): "%B" matches month names in the current locale, so this
    ## presumably needs an English locale to parse the labels -- confirm.
    mutate(date = as.Date(capwords(tolower(date)), format = "%Y %B %d"))

### Extend the inflation series with every distinct by-election date (full
### outer merge), so that dates absent from the series get a row of their own
### with a missing RPI.
by_elex_dates <- dat %>% distinct(Date)
infl <- merge(x = infl,
              y = by_elex_dates,
              by.x = "date", by.y = "Date",
              all = TRUE)

### In date order, carry the last observed RPI forward onto those added rows
infl <- fill(arrange(infl, date), RPI, .direction = "down")

### Attach RPI to the by-elections; inflation rows that match no by-election
### date are discarded
dat <- merge(x = dat,
             y = infl,
             by.x = "Date", by.y = "date",
             all.x = TRUE, all.y = FALSE)

### Handling of candidacy decisions
### For each party group (Con, Lab, Lib, Nat, Oth):
###   *CandChg     = candidates at the by-election minus candidates at the
###                  general election
###   *Entry       = TRUE when that change is positive
###   *Withdrawal  = TRUE when that change is negative
### CandChanges is the sum of the absolute changes over the five groups.
### Later arguments of mutate() can use columns created by earlier ones, so
### everything is built in one call.
dat <- mutate(dat,
              ## Change in number of candidates
              ConCandChg = ConCand_BE - ConCand_GE,
              LabCandChg = LabCand_BE - LabCand_GE,
              LibCandChg = LibCand_BE - LibCand_GE,
              NatCandChg = NatCand_BE - NatCand_GE,
              OthCandChg = OthCand_BE - OthCand_GE,
              ## More candidates than at the general election
              ConEntry = ConCandChg > 0,
              LabEntry = LabCandChg > 0,
              LibEntry = LibCandChg > 0,
              NatEntry = NatCandChg > 0,
              OthEntry = OthCandChg > 0,
              ## Fewer candidates than at the general election
              ConWithdrawal = ConCandChg < 0,
              LabWithdrawal = LabCandChg < 0,
              LibWithdrawal = LibCandChg < 0,
              NatWithdrawal = NatCandChg < 0,
              OthWithdrawal = OthCandChg < 0,
              ## Total absolute movement across all groups
              CandChanges = abs(ConCandChg) +
                  abs(LabCandChg) + abs(LibCandChg) +
                  abs(NatCandChg) + abs(OthCandChg))

### Read the candidacy file, keep only rows with a non-empty IncCandidate
### value, and retain the merge keys plus IncCandidate itself.
cand <- read.csv(here::here("data", "candidacy.csv"))
cand <- cand %>%
    filter(IncCandidate != "") %>%
    dplyr::select(Date, Name, IncCandidate)

### Expand IncCandidate into indicator columns. Column 1 of the model matrix
### (the intercept) is dropped.
### NOTE(review): with an intercept in the formula the first level of
### IncCandidate gets no indicator column of its own; which level that is
### depends on how read.csv() typed the column -- confirm that the
### IncCandidate* columns used further down are all actually produced.
tmp <- model.matrix(~IncCandidate, data = cand)
tmp <- tmp[, -1]
tmp[is.na(tmp)] <- 0
cand <- cbind(dplyr::select(cand, -IncCandidate), tmp)

### Left-merge the indicators onto the by-elections by Date and Name; the row
### count is evaluated before and after so the two values can be compared.
### NOTE(review): Date comes straight from read.csv() here -- presumably its
### text form lines up with dat$Date; confirm the match rate.
nrow(dat)
dat <- merge(x = dat,
             y = cand,
             by = c("Date", "Name"),
             all.x = TRUE, all.y = FALSE)
nrow(dat)

### Season
### month:  abbreviated month name of the by-election date (as rendered by
###         format(), i.e. in the current locale).
### winter: TRUE for by-elections held in November, December, January or
###         February. This is derived from the numeric month ("%m") rather
###         than from the month name, so the flag does not silently become
###         all-FALSE when the script runs under a non-English locale.
dat <- dat %>%
    mutate(month = format(Date, "%b"),
           winter = as.integer(format(Date, "%m")) %in% c(11L, 12L, 1L, 2L))

### Treatment of missing data?
### First, polls: zero replace
### Replace missing values with 0 in every column whose name ends in
### "PollChg" or "PollChgLR", and in npolls. Uses across() rather than the
### superseded mutate_at(), in line with the scaling step later in the file.
dat <- dat %>%
    mutate(across(c(ends_with("PollChg"), ends_with("PollChgLR")),
                  function(x) coalesce(x, 0)),
           npolls = coalesce(npolls, 0))

### Second, inflation: carry backwards
### Sort by date, then fill any remaining missing RPI from the next later
### row that has a value
dat <- fill(arrange(dat, Date), RPI, .direction = "up")

### Third, candidacy
### Rows that found no match in the candidacy merge have NA in every column
### whose name starts with "IncCand"; set those to 0. Uses across() rather
### than the superseded mutate_at().
dat <- dat %>%
    mutate(across(starts_with("IncCand"),
                  function(x) coalesce(x, 0)))

### Make sure percentages are in the range [0, 1]
### Divide each listed column by 100.
### NOTE(review): this presumes the inputs are stored on a 0-100 scale --
### confirm against the upstream tidy data. Uses across() rather than the
### superseded mutate_at().
dat <- dat %>%
    mutate(across(c(Con_BE, Con_GE,
                    Lab_BE, Lab_GE,
                    Lib_BE, Lib_GE,
                    Nat_BE, Nat_GE,
                    Oth_BE, Oth_GE,
                    Oth2_BE, Oth2_GE,
                    Turn_at_GE),
                  function(x) x / 100))


### Start scaling everything
### For every listed variable add three columns:
###   <var>_mean - the variable's mean (constant within the column)
###   <var>_sd   - the variable's standard deviation (constant)
###   <var>_sc   - the variable centred and scaled by scale()
### scale() leaves missing values out when it computes the centre and spread,
### so the stored mean and SD use na.rm = TRUE as well; otherwise a single NA
### would turn <var>_mean / <var>_sd into NA while <var>_sc is still computed.
### NOTE(review): scale() returns a one-column matrix, so each <var>_sc is a
### matrix column rather than a plain numeric vector.
dat <- dat %>%
    dplyr::mutate(dplyr::across(c(Lab_GE, Lib_GE, Nat_GE, Oth_GE,
                                  LabPollChg, LibPollChg, NatPollChg, OthPollChg,
                                  winter, LabGovt, LibGovt,
                                  RPI, npolls, Turn_at_GE,
                                  LabInc, LibInc, NatInc, OthInc,
                                  ConCandChg, LabCandChg,
                                  LibCandChg, NatCandChg, OthCandChg,
                                  ConCand_BE, LabCand_BE,
                                  LibCand_BE, NatCand_BE, OthCand_BE,
                                  ConCand_GE, LabCand_GE,
                                  LibCand_GE, NatCand_GE, OthCand_GE,
                                  IncCandidateCon, IncCandidateLab,
                                  IncCandidateLib, IncCandidateOther),
                                list(mean = function(x) mean(x, na.rm = TRUE),
                                     sd = function(x) sd(x, na.rm = TRUE),
                                     sc = scale)))


### Now scale everything to have same global mean and SD
### Pooled standardisation: within each group of variables, every column is
### centred and scaled by the mean and SD computed over all values of all
### columns in the group (missing values ignored). Results are stored in new
### columns named <var>_sc2.

## General-election vote variables
ge_vars <- c("Lab_GE", "Lib_GE", "Nat_GE", "Oth_GE")
ge_mean <- mean(unlist(dat[ge_vars]), na.rm = TRUE)
ge_sd <- sd(unlist(dat[ge_vars]), na.rm = TRUE)
dat[paste0(ge_vars, "_sc2")] <- lapply(dat[ge_vars],
                                       function(x) (x - ge_mean) / ge_sd)

## Poll-change variables
pollchg_vars <- c("LabPollChg", "LibPollChg", "NatPollChg", "OthPollChg")
pollchg_mean <- mean(unlist(dat[pollchg_vars]), na.rm = TRUE)
pollchg_sd <- sd(unlist(dat[pollchg_vars]), na.rm = TRUE)
dat[paste0(pollchg_vars, "_sc2")] <- lapply(dat[pollchg_vars],
                                            function(x) (x - pollchg_mean) / pollchg_sd)

## Save the output
## Write the combined modelling data set to working/tidy_model_data.rds
saveRDS(object = dat, file = here::here("working", "tidy_model_data.rds"))

